drug_overdose = read_csv("./data/VSRR_Provisional_Drug_Overdose_Death_Counts.csv") %>%
janitor::clean_names()
unique(pull(drug_overdose, state))
## [1] "AK" "AL" "AR" "AZ" "CA" "CO" "CT" "DC" "DE" "FL" "GA" "HI" "IA" "ID" "IL"
## [16] "IN" "KS" "KY" "LA" "MA" "MD" "ME" "MI" "MN" "MO" "MS" "MT" "NC" "ND" "NE"
## [31] "NH" "NJ" "NM" "NV" "NY" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN" "TX" "US"
## [46] "UT" "VA" "VT" "WA" "WI" "WV" "WY" "YC"
# contains 50 states, Washington DC, whole US, and the City of New York.
state_level = c(state.name[1:8], "District of Columbia", state.name[9:32],"New York City", state.name[33:50])
Tidy data:
drug_overdose_52 =
drug_overdose %>%
filter(!(state_name %in% c("United States"))) %>%
relocate(state_name) %>%
mutate(month = factor(month, levels = month.name), # change month and year to factor
year = factor(year),
state_name = factor(state_name, levels = state_level)) %>%
arrange(state_name) %>%
group_by(state_name, year) %>%
mutate(month = sort(month)) # sort by month order
Number of Deaths, Percent with drugs specified, and Number of Drug Overdose Deaths.drug_overdose_death =
drug_overdose_52 %>%
select(-c( footnote_symbol, percent_complete, period, percent_pending_investigation, predicted_value)) %>%
filter(indicator %in% c("Number of Deaths", "Percent with drugs specified", "Number of Drug Overdose Deaths"))
drug_overdose_death
## # A tibble: 11,856 × 7
## # Groups: state_name, year [364]
## state_name state year month indicator data_value footnote
## <fct> <chr> <fct> <fct> <chr> <dbl> <chr>
## 1 Alabama AL 2015 January Number of De… 50565 Numbers may differ …
## 2 Alabama AL 2015 January Percent with… 49.3 Numbers may differ …
## 3 Alabama AL 2015 January Number of Dr… 738 Numbers may differ …
## 4 Alabama AL 2015 February Percent with… 46.9 Numbers may differ …
## 5 Alabama AL 2015 February Number of Dr… 736 Numbers may differ …
## 6 Alabama AL 2015 February Number of De… 50930 Numbers may differ …
## 7 Alabama AL 2015 March Number of Dr… 720 Numbers may differ …
## 8 Alabama AL 2015 March Number of De… 50870 Numbers may differ …
## 9 Alabama AL 2015 March Percent with… 49.6 Numbers may differ …
## 10 Alabama AL 2015 April Number of Dr… 723 Numbers may differ …
## # … with 11,846 more rows
Number of Drug Overdose Deaths is predicted since the Percent with drugs specified is not 100%.footnote categorization:
#unique(pull(drug_overdose_51, footnote))
drug_overdose_52 =
drug_overdose_52 %>%
mutate(low_data_quality = ifelse(str_detect(footnote, "low data quality"), 1, 0), # data_value not shown, predicted yes?
suppressed = ifelse(str_detect(footnote, "suppressed"), 1, 0),
underreported = ifelse(str_detect(footnote, "Underreported"), 1, 0)) %>%
relocate(footnote, .after = last_col())
drug_categories =
drug_overdose_52 %>%
ungroup() %>%
select(-c(state, footnote_symbol, percent_complete, period, percent_pending_investigation, footnote, predicted_value)) %>%
filter(str_detect(indicator, "T4"))
drug_categories
## # A tibble: 29,412 × 8
## state_name year month indicator data_value low_data_quality suppressed
## <fct> <fct> <fct> <chr> <dbl> <dbl> <dbl>
## 1 Alaska 2015 January Natural & s… NA 1 0
## 2 Alaska 2015 January Natural & s… NA 1 0
## 3 Alaska 2015 January Natural, se… NA 1 0
## 4 Alaska 2015 January Synthetic o… NA 1 0
## 5 Alaska 2015 January Heroin (T40… NA 1 0
## 6 Alaska 2015 January Methadone (… NA 1 1
## 7 Alaska 2015 January Opioids (T4… NA 1 0
## 8 Alaska 2015 January Cocaine (T4… NA 1 0
## 9 Alaska 2015 January Psychostimu… NA 1 0
## 10 Alaska 2015 February Natural & s… NA 1 0
## # … with 29,402 more rows, and 1 more variable: underreported <dbl>
# missing states' data:
miss_states =
drug_overdose_52 %>%
ungroup() %>%
select(state_name) %>%
unique() %>%
filter(!(state_name %in% drug_categories$state_name))
miss_states
## # A tibble: 9 × 1
## state_name
## <fct>
## 1 Alabama
## 2 Arkansas
## 3 Florida
## 4 Idaho
## 5 Louisiana
## 6 Minnesota
## 7 Nebraska
## 8 North Dakota
## 9 Pennsylvania
drug_type_plot =
drug_overdose %>%
filter(state %in% c("US")) %>%
filter(!(indicator %in% c("Number of Deaths", "Number of Drug Overdose Deaths", "Percent with drugs specified")))%>%
relocate(state) %>%
mutate(month = factor(month, levels = month.name), # change month and year to factor
year = factor(year)) %>%
arrange(state) %>%
group_by(state, year) %>%
mutate(month = sort(month)) %>%
ggplot(aes(x = month, y = data_value,color = indicator)) +
geom_point()+
facet_grid(~year)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.4, hjust = 1))
drug_type_plot
- missing data in 9 states: Alabama, Arkansas, Florida, Idaho, Louisiana, Minnesota, Nebraska, North Dakota, Pennsylvania.
which states have highest death in each year option1: maybe this one better
overview_year =
drug_overdose %>%
filter(indicator == c("Number of Deaths", "Number of Drug Overdose Deaths")) %>%
select("state", "year", "month", "indicator", "data_value") %>%
filter(!(state == "US")) %>%
filter(str_detect(indicator, "Drug Overdose Deaths")) %>%
group_by(state, year, indicator) %>%
summarize(data_value = sum(data_value))
## `summarise()` has grouped output by 'state', 'year'. You can override using the `.groups` argument.
overview_year_plot =
overview_year %>%
ggplot(aes(x = year, y = data_value, color = state)) +
geom_point() +
geom_line()+
theme_set(theme_minimal() + theme(legend.position = "bottom")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.4, hjust = 1))
overview_year_plot
option2:
drug_overdose_52 %>%
filter(str_detect(indicator, "Drug Overdose Deaths")) %>%
mutate(state_name = factor(state_name, levels = state_level)) %>%
select(state_name, year, month, data_value) %>%
arrange(state_name, month) %>%
ungroup() %>%
ggplot(aes(y = data_value, x = month, color = state_name)) +
geom_point() +
geom_line(aes(group = state_name))+
facet_grid(~year)+
theme_set(theme_minimal() + theme(legend.position = "bottom")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.4, hjust = 1))
trend_year_plot =
drug_overdose %>%
filter(indicator == c("Number of Deaths", "Number of Drug Overdose Deaths")) %>%
select("state", "year", "month", "indicator", "data_value") %>%
filter(!(state == "US")) %>%
group_by(state, year, indicator) %>%
summarize(data_value = sum(data_value)) %>%
filter(str_detect(indicator, "Drug Overdose Deaths")) %>%
ggplot(aes(x = year, y = data_value)) +
geom_smooth(se = FALSE) +
theme_set(theme_minimal() + theme(legend.position = "bottom")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.4, hjust = 1))
## `summarise()` has grouped output by 'state', 'year'. You can override using the `.groups` argument.
trend_year_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
2015 vs. 2021
sum_df =
drug_overdose_death %>%
ungroup() %>%
filter(indicator %in% c("Number of Deaths", "Number of Drug Overdose Deaths")) %>%
select(state_name, year, month, indicator, deaths = data_value) %>%
pivot_wider(
names_from = indicator,
values_from = deaths
) %>%
janitor::clean_names()
plot_df = sum_df %>%
group_by(state_name, year, month) %>%
ungroup() %>%
group_by(state_name, year) %>%
summarize(number_of_total_deaths = sum(number_of_deaths),
number_of_total_drug_deaths = sum(number_of_drug_overdose_deaths)) %>%
mutate(percent_overdose_death = number_of_total_drug_deaths / number_of_total_deaths) %>%
filter(year == c(2015, 2020)) %>%
ungroup()
## `summarise()` has grouped output by 'state_name'. You can override using the `.groups` argument.
plot_df
## # A tibble: 104 × 5
## state_name year number_of_total_deaths number_of_total_dr… percent_overdose…
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 Alabama 2015 607641 8818 0.0145
## 2 Alabama 2020 682214 10624 0.0156
## 3 Alaska 2015 50086 1472 0.0294
## 4 Alaska 2020 55824 1591 0.0285
## 5 Arizona 2015 642297 14590 0.0227
## 6 Arizona 2020 810432 28466 0.0351
## 7 Arkansas 2015 368117 4616 0.0125
## 8 Arkansas 2020 407063 5325 0.0131
## 9 California 2015 3056952 56001 0.0183
## 10 California 2020 3456728 92987 0.0269
## # … with 94 more rows
p = ggplot(data = plot_df, aes(x = percent_overdose_death, y=reorder(state_name, percent_overdose_death)))+
geom_line(aes(group = state_name), alpha = 0.5)+
geom_point(aes(color = year), alpha = 0.6, size = 4)
ggplotly(p)
data_2 =
sum_df %>%
group_by(year, month) %>%
arrange(year, month, -number_of_drug_overdose_deaths) %>%
mutate(rank = 1:n(),
month = match(month, month.name),
date = str_c(year,"-",month),
date = as.Date(paste(date, "-01", sep =''))) %>%
filter(rank<=20)
data_2
my_plot =
data_2 %>%
ggplot()+
aes(xmin = 0,
xmax = number_of_drug_overdose_deaths) +
aes(ymin = rank - 0.45,
ymax = rank + 0.45,
y = rank) +
facet_wrap(~ date) +
geom_rect(alpaha = 0.7) +
aes(fill = state_name)+
scale_fill_viridis_d(option = "magma",
direction = -1) +
scale_x_continuous(
limits = c(-1000, 12000),
breaks = c(-1000, 0, 3000, 6000,9000,12000)
)+
geom_text(col = "darkblue",
hjust = "right",
aes(label = state_name),
x = -100)+
geom_text(col = "darkblue",
hjust = "right",
aes(label = paste(number_of_drug_overdose_deaths),x=12000))+
scale_y_reverse()+
labs(fill = NULL)+
labs(x = "Death cases")+
labs(y = "States")+
theme(legend.position = "none")
p = my_plot +
facet_null()+
geom_text(x = 8000, y = -20,
family = "Times",
aes(label = as.character(date)),
size = 10, col = "black")+
aes(group = state_name)+
transition_time(date)
#animate(p, nframes=350,fps=5,width=1000)
gif = animate(p,nframes = 700, fps=15,width=1000,height=700,
renderer = gifski_renderer())
gif
anim_save("animation.gif", animation = gif)